Credit Card Fraud Detection
Source: https://www.kaggle.com/mlg-ulb/creditcardfraud
Data: The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection.
Download creditcard.csv from Kaggle.
It is important that credit card companies are able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.
The dataset contains transactions made by credit cards in September 2013 by european cardholders.
This dataset presents transactions that occurred over a period of two days.
Classify fraudulent and non-fraudulent credit card transactions.
Data file's information:
There are two different classes a credit card transaction can be classified into => classification problem
Objective: Predict which of the two classes the datapoints belong to.
Constraints:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from sklearn.linear_model import SGDClassifier
from collections import Counter
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, fbeta_score
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.manifold import TSNE
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import parfit.parfit as pf
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import RobustScaler
from pandas_profiling import ProfileReport
from tensorflow import set_random_seed
from keras.layers import Dense, BatchNormalization
from keras.initializers import he_normal
from keras.layers import Dropout
from keras.models import Sequential
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from IPython.display import SVG
import xgboost as xgb
import scipy.stats as ss
import matplotlib
# Seed NumPy and TensorFlow RNGs so results are reproducible across runs.
np.random.seed(2020)
set_random_seed(2020)
# 'nbagg' selects the interactive notebook backend; the '%matplotlib inline'
# magic below renders figures inline in the notebook.
matplotlib.use('nbagg')
import warnings
# NOTE(review): blanket suppression also hides deprecation warnings.
warnings.filterwarnings("ignore")
%matplotlib inline
# Load the Kaggle credit-card dataset and do a first-pass inspection:
# shape, columns, summary stats, missing values.
dataset = pd.read_csv('creditcard.csv')
dataset.shape
dataset.columns
dataset.head(5)
dataset.describe()
dataset.isnull().values.any()
dataset.info()
# Class balance: 0 = legitimate, 1 = fraud (heavily imbalanced).
print(dataset['Class'].value_counts())
print('-'*56)
sns.countplot(dataset['Class'])
plt.show()
y_value_counts = dataset['Class'].value_counts()
print("Number of Safe transactions ", y_value_counts[0], ", (", (y_value_counts[0]/(y_value_counts[0]+y_value_counts[1]))*100,"%)")
print("Number of fraud transactions", y_value_counts[1], ", (", (y_value_counts[1]/(y_value_counts[0]+y_value_counts[1]))*100,"%)")
print('-'*80)
# Pie chart of the class balance: one wedge per class.
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(aspect="equal"))
recipe = ["Safe transactions", "Fraud transactions"]
data = [y_value_counts[0], y_value_counts[1]]
ingredients = [x for x in recipe]
def func(pct, allvals):
    """Format a pie-wedge label as the percentage plus the absolute count.

    pct     -- wedge percentage supplied by matplotlib's autopct hook
    allvals -- the full list of wedge values, used to recover the count
    """
    total = np.sum(allvals)
    count = int(pct / 100. * total)
    return "{:.1f}%\n({:d})".format(pct, count)
# Draw the pie; each wedge is annotated with its percentage and count via func.
wedges, texts, autotexts = ax.pie(data, autopct=lambda pct: func(pct, data),
                                  textprops=dict(color="w"))
ax.legend(wedges, ingredients,
          title = "Safe and fraud transactions",
          loc="center left",
          bbox_to_anchor=(1, 0, 0.5, 1))
plt.setp(autotexts, size=14, weight="bold")
ax.set_title("Frauds Detected in credit card transactions", weight='bold')
plt.show()
# Split the frame by class and compare the 'Time' distributions side by side.
fraud = dataset[dataset['Class'] == 1]
non_fraud = dataset[dataset['Class'] == 0]
plt.figure(figsize=(20,8))
plt.subplot(1, 2, 1)
plt.title('Histogram of Time for non-fraudulent samples')
sns.distplot(non_fraud["Time"])
plt.subplot(1, 2, 2)
plt.title('Histogram of Time for fraudulent samples')
sns.distplot(fraud["Time"])
As we can see, the Time feature doesn't yield any important information: there is not much difference between the Time distributions of fraudulent and non-fraudulent transactions, so we can ignore the Time feature.
# Compare the 'Amount' distributions of the two classes, then box-plot
# 'Time' and 'Amount' against the class label to inspect outliers.
plt.figure(figsize=(20,8))
plt.subplot(1, 2, 1)
plt.title('Histogram of Amount for non-fraudulent samples')
sns.distplot(non_fraud["Amount"])
plt.subplot(1, 2, 2)
plt.title('Histogram of Amount for fraudulent samples')
sns.distplot(fraud["Amount"])
plt.figure(figsize=(20,8))
plt.subplot(1, 2, 1)
plt.title('Boxplot of Time feature')
sns.boxplot(x='Class', y='Time', data=dataset)
plt.subplot(1, 2, 2)
plt.title('Boxplot of Amount feature')
sns.boxplot(x='Class', y='Amount', data=dataset)
import matplotlib.gridspec as gridspec
# Overlay fraud vs. normal density estimates for each of the 28 anonymised
# PCA features (columns 1..28) in a 14x2 grid of subplots.
columns = dataset.iloc[:,1:29].columns
grid = gridspec.GridSpec(14, 2)
plt.figure(figsize=(15,20*4))
frauds = dataset.Class == 1
normals = dataset.Class == 0
for n, col in enumerate(dataset[columns]):
    ax = plt.subplot(grid[n])
    sns.distplot(dataset[col][frauds], bins = 50) #Will receive the "semi-salmon" violin
    sns.distplot(dataset[col][normals], bins = 50) #Will receive the "ocean" color
    ax.set_ylabel('Density')
    ax.set_title(str(col))
    ax.set_xlabel('')
plt.show()
#Looking the Amount and time distribuition of FRAUD transactions
# Scatter Amount vs. Time, first coloured by class, then as two stacked
# panels sharing the x axis (fraud on top, non-fraud below).
plt.figure(figsize=(20,8))
ax = sns.lmplot(y="Amount", x="Time", fit_reg=False,aspect=1.8,
                data=dataset, hue='Class')
plt.title("(Amount vs Time in Minutes) of Fraud and Normal Transactions",fontsize=16)
plt.show()
fig, (axis_1, axis_2) = plt.subplots(2, 1, sharex=True, figsize=(18,12))
axis_1.scatter(dataset['Time'][dataset['Class'] == 1], dataset['Amount'][dataset['Class'] == 1])
axis_1.set_title('Fraud')
axis_2.scatter(dataset['Time'][dataset['Class'] == 0], dataset['Amount'][dataset['Class'] == 0])
axis_2.set_title('Non_Fraud')
plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()
from sklearn.preprocessing import StandardScaler
# Standardize every column so the covariance matrix of the scaled data
# equals the correlation matrix, then draw it as an annotated heatmap.
cols = dataset.columns
std = StandardScaler()
# The original chained an identity slice (iloc[:, range(len(columns))]) onto
# dataset[cols]; selecting the columns once is equivalent and clearer.
X_std = std.fit_transform(dataset[cols].values)
cov_mat = np.cov(X_std.T)
plt.figure(figsize=(20,16))
sns.set(font_scale=1.5)
hm = sns.heatmap(cov_mat, annot=True, square=True, fmt='.2f', annot_kws={'size': 12}, cmap='coolwarm',
                 yticklabels=cols, xticklabels=cols)
plt.title('Covariance matrix showing correlation coefficients for the given data', size = 18)
plt.tight_layout()
plt.show()
# This function plots the confusion matrices given y_i, y_i_hat.
def plot_confusion_matrix(test_y, predict_y):
    """Plot confusion, precision and recall matrices and print headline metrics.

    test_y    -- true binary labels (0 = safe, 1 = fraud)
    predict_y -- predicted binary labels
    """
    C = confusion_matrix(test_y, predict_y)
    # A: rows normalised by true-class totals -> recall matrix.
    A =(((C.T)/(C.sum(axis=1))).T)
    # B: columns normalised by predicted-class totals -> precision matrix.
    B =(C/C.sum(axis=0))
    plt.figure(figsize=(20,4))
    # The classes in this dataset are 0 and 1; the original tick labels [1, 2]
    # mislabelled both axes of every heatmap.
    labels = [0,1]
    cmap=sns.light_palette("blue")
    # Raw counts (C) in heatmap format.
    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")
    # Precision matrix (B) in heatmap format.
    plt.subplot(1, 3, 2)
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")
    # Recall matrix (A) in heatmap format.
    plt.subplot(1, 3, 3)
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")
    print('Accuracy: ', accuracy_score(test_y,predict_y))
    print('Recall: ', recall_score(test_y,predict_y))
    print('Precision: ', precision_score(test_y,predict_y))
    print('F1 score: ', f1_score(test_y, predict_y))
    plt.show()
def plot_precison_recall_curve(model,x_test_data, y_test_data):
    """Plot the precision-recall curve for `model` and print the area under it.

    model requires a predict_proba method; column 1 is taken as the
    positive-class probability.
    """
    # Compute predicted probabilities: y_pred_prob
    y_pred_prob = model.predict_proba(x_test_data)[:,1]
    # Generate precision recall curve values: precision, recall, thresholds
    precision, recall, thresholds = precision_recall_curve(y_test_data, y_pred_prob)
    print('Area under precision recall curve: ', auc(recall, precision))
    # Plot precision-recall curve. The original plotted (precision, recall)
    # while labelling x as Recall and y as Precision -- the axes were swapped.
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve')
    plt.show()
def plot_roc_auc_curve(model, x_test_data, y_test_data):
    """Plot the ROC curve for `model` on the test data and print the ROC-AUC."""
    # Positive-class probabilities (column 1 of predict_proba).
    prob_pos = model.predict_proba(x_test_data)[:,1]
    fpr, tpr, _thresholds = roc_curve(y_test_data, prob_pos)
    area = roc_auc_score(y_test_data, prob_pos)
    print('ROC Area Under Curve (ROC_AUC): ',area)
    plt.title('Receiver Operating Characteristic')
    # Model's curve in red, chance diagonal as a dashed blue line.
    plt.plot(fpr, tpr, 'r', label='AUC = %.3f'% area)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'b--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def feature_importance(model, feat, model_name):
    """Bar-plot `model.feature_importances_` for `feat`, most important first.

    model      -- fitted estimator exposing feature_importances_
    feat       -- feature names, aligned with the importances
    model_name -- string appended to the plot title
    """
    plt.figure(figsize = (9,5))
    importance_df = pd.DataFrame(
        {'Feature': feat, 'Feature importance': model.feature_importances_}
    ).sort_values(by='Feature importance', ascending=False)
    axis = sns.barplot(x='Feature', y='Feature importance', data=importance_df)
    # Rotate the feature names so they remain readable.
    axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
    axis.set_title('Features importance - '+model_name,fontsize=20)
    plt.show()
# Drop 'Time' (uninformative per the EDA above) and robust-scale 'Amount'.
dataset = dataset.drop('Time', axis=1)
# Since the amount features has outliers we will use robust scaling instead of standard scaling
rob_amt = RobustScaler()
dataset['Amount'] = rob_amt.fit_transform(dataset['Amount'].values.reshape(-1,1))
X = dataset.drop(['Class'],axis=1)
Y = dataset.iloc[:,-1:]
from sklearn.model_selection import train_test_split
# Stratified 70/30 train/test split, then 70/30 train/validation split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify = Y, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, test_size=0.3, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(x_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)
#distribution of classes across train and test data
print('the number of fraudulent transactions in train data is', y_train['Class'].value_counts()[1], 'and non fraudulent transactions is', y_train['Class'].value_counts()[0])
print('the number of fraudulent transactions in test data is', y_test['Class'].value_counts()[1], 'and non fraudulent transactions is', y_test['Class'].value_counts()[0])
print('the number of fraudulent transactions in validation data is', y_val['Class'].value_counts()[1], 'and non fraudulent transactions is', y_val['Class'].value_counts()[0])
# Baseline logistic regression (SGDClassifier with log loss) tuned over the
# regularisation strength alpha via parfit's validation-set search.
# NOTE(review): loss='log' is the old sklearn spelling; newer releases use
# 'log_loss' -- confirm against the installed sklearn version.
param = {'alpha':[0.000001,0.00001,0.0001, 0.001, 0.01, 0.1]}
paramGrid = ParameterGrid(param)
bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier(loss='log', random_state=0), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score")
print(bestModel, bestScore)
# Evaluate the best model on the held-out test split.
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Baseline random forest tuned over tree depth and ensemble size.
params= {
    'max_depth':[3,5,7,9,10,12],
    'n_estimators':[10,20,30,60,80,100]
}
paramGrid = ParameterGrid(params)
bestModel, bestScore, allModels, allScores = pf.bestFit(RandomForestClassifier(random_state=0, n_jobs=-1, criterion='entropy'),
    paramGrid,x_train, y_train, x_val, y_val, nfolds =5,
    metric = f1_score, scoreLabel = "f1_score", n_jobs=-1)
print(bestModel, bestScore)
# Evaluate the best model on the held-out test split.
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
from prettytable import PrettyTable
# Test-set metrics (in %) for the models trained on the full, imbalanced
# training data; the numbers are transcribed from the runs above.
x = PrettyTable()
x.title="Model Comparison"
x.field_names = ["Model","f1_score","Accuracy","Precision", "Recall"]
x.add_row(["Hyperparameter tuned Logistic Regression",72.38,99.91,80.83,65.54 ])
x.add_row(["Hyperparameter tuned Random Forest", 82.62, 99.94, 96.39, 72.29])
print(x)
# we will sample only the train data
# Copy so the balanced frame does not alias (and silently mutate) x_train;
# the original `data = x_train` added a 'Class' column to x_train itself.
data = x_train.copy()
data['Class'] = y_train
fraud = data[data['Class'] == 1]
# 361 is a hard-coded majority-class sample size chosen to give a roughly
# balanced set against the fraud rows in this train split.
non_fraud = data[data['Class'] == 0].sample(361, random_state=0)
non_fraud.reset_index(drop=True, inplace=True)
fraud.reset_index(drop=True, inplace=True)
# Concatenate, shuffle (sample(frac=1)) and re-index the balanced frame.
data = pd.concat([non_fraud, fraud]).sample(frac=1).reset_index(drop=True)
data.describe()
print('Total data points in the dataset',len(x_train))
print('-'*56)
sns.countplot(data['Class'])
plt.show()
print('-'*80)
print("Number of Safe transactions (CLASS: 0)", data['Class'].value_counts()[0], ", (", (data['Class'].value_counts()[0]/(data['Class'].value_counts()[0]+data['Class'].value_counts()[1]))*100,"%)")
print("Number of fraud transactions (CLASS: 1)", data['Class'].value_counts()[1], ", (", (data['Class'].value_counts()[1]/(data['Class'].value_counts()[0]+data['Class'].value_counts()[1]))*100,"%)")
# Replace the training split with the balanced sample.
x_train = data.drop('Class', axis=1)
y_train = data.iloc[:,-1:]
#class distribution
print('the number of fraudulent transactions in train data is', y_train['Class'].value_counts()[1], 'and non fraudulent transactions is', y_train['Class'].value_counts()[0])
print('the number of fraudulent transactions in test data is', y_test['Class'].value_counts()[1], 'and non fraudulent transactions is', y_test['Class'].value_counts()[0])
print('the number of fraudulent transactions in validation data is', y_val['Class'].value_counts()[1], 'and non fraudulent transactions is', y_val['Class'].value_counts()[0])
from sklearn.preprocessing import StandardScaler
# Correlation heatmap of the balanced sample (covariance of standardized
# columns equals correlation).
cols = data.columns
std = StandardScaler()
# The original chained an identity slice (iloc[:, range(len(columns))]) onto
# data[cols]; selecting the columns once is equivalent and clearer.
X_std = std.fit_transform(data[cols].values)
cov_mat = np.cov(X_std.T)
plt.figure(figsize=(20,16))
sns.set(font_scale=1.5)
hm = sns.heatmap(cov_mat, annot=True, square=True, fmt='.2f', annot_kws={'size': 12}, cmap='coolwarm',
                 yticklabels=cols, xticklabels=cols)
plt.title('Covariance matrix showing correlation coefficients for new Sampled data', size = 18)
plt.tight_layout()
plt.show()
Important features selected from the covariance matrix, based on how strongly each feature correlates with the Class label:
# Features whose absolute correlation with 'Class' exceeds 0.5 in the
# matrix computed above.
df = pd.DataFrame(cov_mat, columns=data.columns )
imp_feats = data.columns[np.abs(df["Class"]) > 0.5]
imp_feats
# 2-D visualisations of the balanced training sample, coloured by class:
# PCA, t-SNE (perplexity 50) and spectral embedding (30 neighbours).
features = x_train
labels = y_train
pca = PCA(n_components=2, random_state = 0)
X_pca_embed = pca.fit_transform(features.values)
# Stack the 2-D embedding with the labels into one (n, 3) array for plotting.
for_pca = np.hstack((X_pca_embed, labels.values.reshape(-1,1)))
for_pca_df = pd.DataFrame(data=for_pca, columns=['Dimension_x','Dimension_y','Class'])
plt.figure(figsize=(15,10))
sns.scatterplot(x='Dimension_x', y='Dimension_y', hue='Class', data=for_pca_df, s=80)
plt.title('PCA for reduced data features')
plt.legend()
plt.show()
tsne = TSNE(n_components=2, perplexity=50, learning_rate=200, random_state = 0)
X_tsne_embed = tsne.fit_transform(features.values)
for_tsne = np.hstack((X_tsne_embed, labels.values.reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dimension_x','Dimension_y','Class'])
plt.figure(figsize=(15,10))
sns.scatterplot(x='Dimension_x', y='Dimension_y', hue='Class', data=for_tsne_df, s=80)
plt.title('TSNE for reduced data features with perplexity = 50')
plt.legend()
plt.show()
nspec_emb = SpectralEmbedding(n_components=2, affinity='nearest_neighbors', n_neighbors=30)
X_nspec_embed = nspec_emb.fit_transform(features.values)
for_nspec = np.hstack((X_nspec_embed, labels.values.reshape(-1,1)))
for_nspec_df = pd.DataFrame(data=for_nspec, columns=['Dimension_x','Dimension_y','Class'])
plt.figure(figsize=(15,10))
sns.scatterplot(x='Dimension_x', y='Dimension_y', hue='Class', data=for_nspec_df, s=80)
plt.title('Spectral Embedding for reduced data features with n_neighbors = 30')
plt.legend()
plt.show()
# Logistic regression (SGD, log loss) re-tuned on the undersampled train set.
param={'alpha':[0.000001,0.00001,0.0001, 0.001, 0.01, 0.1]}
paramGrid = ParameterGrid(param)
bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier(loss='log', random_state=0), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
# Evaluate on the untouched (still imbalanced) test split.
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Random forest re-tuned on the undersampled train set.
param_grid = {
    'max_depth':[3,5,7,9,10,12],
    'n_estimators':[10,20,30,60,80,100]
}
# Creating the classifier
paramGrid = ParameterGrid(param_grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(RandomForestClassifier(criterion ='entropy',random_state=0, n_jobs=4),
    paramGrid, x_train, y_train, x_val, y_val, metric = f1_score,
    scoreLabel = "f1_score", n_jobs=-1, nfolds =5)
print(bestModel, bestScore)
# Evaluate on the untouched test split and inspect feature importances.
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
feats = list(x_train.columns)
feature_importance(bestModel, feats, 'Random Forest')
# XGBoost tuned over ensemble size and min_child_weight on the
# undersampled train set.
param_grid = {'n_estimators':[3,5,10,15,20,30],
    'min_child_weight':[1,2,3,5,7]}
paramGrid = ParameterGrid(param_grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(xgb.XGBClassifier(random_state=0, n_jobs=4), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds =5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
Feature Importance
feature_importance(bestModel, feats, 'XG Boost')
# Fully-connected Keras net: three hidden ReLU layers (128/64/32), each
# followed by batch norm + 30% dropout, and a sigmoid output for the
# binary fraud label. Early stopping watches validation loss.
input_dim = x_train.shape[1]
output_dim = 1
Early_stop= keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=8, verbose=0, mode='min',
    restore_best_weights= True)
model1 = Sequential()
model1.add(Dense(128, activation='relu', input_shape=(input_dim,), kernel_initializer=he_normal(seed=None)))
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(64, activation='relu', kernel_initializer=he_normal(seed=None)) )
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(32, activation='relu', kernel_initializer=he_normal(seed=None)) )
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(output_dim, activation='sigmoid'))
# NOTE(review): the model is saved before compile/fit, so model1.h5 holds
# untrained weights -- confirm whether saving after training was intended.
model1.save('model1.h5')
model1.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])
model1.summary()
# Render the architecture to a PNG and an inline SVG diagram.
plot_model(model1, to_file='model1.png')
td = model_to_dot(model1, dpi=65)
td.set_size('60x12')
SVG(td.create(prog='dot', format='svg'))
train_model1 = model1.fit(x_train, y_train, validation_data=(x_val,y_val), batch_size=100, epochs=50,
    shuffle=False, callbacks=[Early_stop])
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import iplot
def create_trace(x,y,ylabel,color):
    """Build a markers+lines plotly Scatter trace named `ylabel` in `color`."""
    return go.Scatter(
        x=x,
        y=y,
        name=ylabel,
        marker=dict(color=color),
        mode="markers+lines",
        # Hover text mirrors the x values (epoch numbers).
        text=x,
    )
def plot_accuracy_and_loss(train_model):
    """Plot per-epoch train/validation accuracy and loss side by side (plotly).

    train_model -- the History object returned by keras Model.fit
    """
    hist = train_model.history
    # NOTE(review): 'acc'/'val_acc' are old Keras history keys; newer Keras
    # uses 'accuracy'/'val_accuracy' -- confirm against the installed version.
    acc = hist['acc']
    val_acc = hist['val_acc']
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = list(range(1,len(acc)+1))
    trace_ta = create_trace(epochs,acc,"Training accuracy", "Green")
    trace_va = create_trace(epochs,val_acc,"Validation accuracy", "Red")
    trace_tl = create_trace(epochs,loss,"Training loss", "Blue")
    trace_vl = create_trace(epochs,val_loss,"Validation loss", "Magenta")
    fig = make_subplots(rows=1,cols=2, subplot_titles=('Training and validation accuracy',
        'Training and validation loss'))
    # Accuracy traces in column 1, loss traces in column 2.
    fig.append_trace(trace_ta,1,1)
    fig.append_trace(trace_va,1,1)
    fig.append_trace(trace_tl,1,2)
    fig.append_trace(trace_vl,1,2)
    fig['layout']['xaxis'].update(title = 'Epoch')
    fig['layout']['xaxis2'].update(title = 'Epoch')
    fig['layout']['yaxis'].update(title = 'Accuracy', range=[acc[0]-0.02,1])
    fig['layout']['yaxis2'].update(title = 'Loss', range=[0,val_loss[-1]+0.8])
    iplot(fig, filename='accuracy-loss')
plot_accuracy_and_loss(train_model1)
# model1 interleaves BatchNormalization (4 weight arrays each: gamma, beta,
# moving mean, moving variance) between the Dense layers, so the Dense
# kernels sit at get_weights() indices 0, 6, 12 and 18. The original
# indices 2/4/6 picked up batch-norm parameters, not layer weights.
w_after = model1.get_weights()
h1_w = w_after[0].flatten().reshape(-1,1)
h2_w = w_after[6].flatten().reshape(-1,1)
h3_w = w_after[12].flatten().reshape(-1,1)
out_w = w_after[18].flatten().reshape(-1,1)
# Violin plots of the weight vectors extracted above, one subplot per layer,
# then test-set evaluation of the trained network.
fig = plt.figure()
plt.title("Weight matrices after model trained")
plt.subplot(1, 4, 1)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h1_w,color='b')
plt.xlabel('Hidden Layer 1')
plt.title("Weight matrices after model trained")
plt.subplot(1, 4, 2)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h2_w,color='b')
plt.xlabel('Hidden Layer 2')
plt.title("Weight matrices after model trained")
plt.subplot(1, 4, 3)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h3_w,color='b')
plt.xlabel('Hidden Layer 3')
plt.subplot(1, 4, 4)
plt.title("Trained model Weights")
ax = sns.violinplot(y=out_w,color='y')
plt.xlabel('Output Layer ')
plt.show()
score = model1.evaluate(x_test, y_test)
print('Test score:', score[0])
print('Test accuracy:', score[1])
# Sigmoid outputs are probabilities; round() thresholds them at 0.5.
y_pred = model1.predict(x_test)
y_test = pd.DataFrame(y_test)
y_pred = pd.DataFrame(y_pred)
plot_confusion_matrix(y_test, y_pred.round())
from prettytable import PrettyTable
# Test-set metrics (in %) for the models trained on the randomly
# undersampled data; numbers transcribed from the runs above.
x = PrettyTable()
x.title="Model Comparison"
x.field_names = ["Model","f1_score","Accuracy","Precision", "Recall"]
x.add_row(["Hyperparameter tuned Logistic Regression",17.13,98.57,9.52,85.13])
x.add_row(["Hyperparameter tuned Random Forest", 25.96, 99.16, 15.33, 84.45])
x.add_row(["Hyperparameter tuned XG Boost", 14.05, 98.16, 7.64, 86.48])
x.add_row(["2 Hidden layer MLP, keras",13.37, 97.99, 7.22, 89.18])
print(x)
Condensed Nearest Neighbour takes the nearest neighbours into consideration when sampling the majority class.
# we undersample only the train data
# Undersample the majority class with Condensed Nearest Neighbour, which
# keeps majority points near the decision boundary.
# NOTE(review): x_train/y_train were replaced by the random undersample
# earlier in the notebook and are not re-split before this step -- confirm
# whether CNN was meant to run on the original train split.
from imblearn.under_sampling import CondensedNearestNeighbour
undersample = CondensedNearestNeighbour(n_neighbors=3)
# NOTE(review): no random_state is passed, so this resample is not reproducible.
x_usample, y_usample = undersample.fit_resample(x_train.values, y_train.values)
# Class counts before and after resampling.
counter = Counter(y_train.values.ravel())
print(counter)
counter = Counter(y_usample)
print(counter)
# The resampled arrays lose column names; y_train's label column is now 0.
x_train = pd.DataFrame(x_usample)
y_train = pd.DataFrame(y_usample)
print('the number of fraudulent transactions in train data is', y_train[0].value_counts()[1], 'and non fraudulent transactions is', y_train[0].value_counts()[0])
print('the number of fraudulent transactions in test data is', y_test['Class'].value_counts()[1], 'and non fraudulent transactions is', y_test['Class'].value_counts()[0])
print('the number of fraudulent transactions in validation data is', y_val['Class'].value_counts()[1], 'and non fraudulent transactions is', y_val['Class'].value_counts()[0])
# 2-D visualisations (PCA and t-SNE) of the CNN-undersampled train set.
features = x_train
labels = y_train
pca = PCA(n_components=2, random_state = 0)
X_pca_embed = pca.fit_transform(features.values)
for_pca = np.hstack((X_pca_embed, labels.values.reshape(-1,1)))
for_pca_df = pd.DataFrame(data=for_pca, columns=['Dimension_x','Dimension_y','Class'])
plt.figure(figsize=(15,10))
sns.scatterplot(x='Dimension_x', y='Dimension_y', hue='Class', data=for_pca_df, s=80)
plt.title('PCA for reduced data features')
plt.legend()
plt.show()
tsne = TSNE(n_components=2, perplexity=50, learning_rate=200, random_state = 0)
X_tsne_embed = tsne.fit_transform(features.values)
for_tsne = np.hstack((X_tsne_embed, labels.values.reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dimension_x','Dimension_y','Class'])
plt.figure(figsize=(15,10))
sns.scatterplot(x='Dimension_x', y='Dimension_y', hue='Class', data=for_tsne_df, s=80)
plt.title('TSNE for reduced data features with perplexity = 50')
plt.legend()
plt.show()
The Condensed Nearest Neighbour algorithm has preserved the neighborhood relationship between the classes while sampling the majority class.
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours tuned over k, search algorithm and leaf size on the
# CNN-undersampled train set.
param={'n_neighbors':[2, 3, 4, 5, 6, 8],
    'algorithm':['ball_tree','kd_tree','brute'],
    'leaf_size':[5,10,20,30,40,50]}
paramGrid = ParameterGrid(param)
bestModel, bestScore, allModels, allScores = pf.bestFit(KNeighborsClassifier(n_jobs=4), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Logistic regression (SGD, log loss) tuned on the CNN-undersampled train set.
param={'alpha':[0.000001,0.00001,0.0001, 0.001, 0.01, 0.1]}
paramGrid = ParameterGrid(param)
bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier(loss='log', random_state=0), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Random forest tuned over depth and leaf size on the CNN-undersampled set.
param_grid = {
    'max_depth':[3,5,7,9,10,12],
    'min_samples_leaf':[1, 3, 5, 7, 10, 12]
}
paramGrid = ParameterGrid(param_grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(RandomForestClassifier(criterion='entropy',random_state=0, n_jobs=4),
    paramGrid, x_train, y_train, x_val, y_val, metric = f1_score,
    scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
Feature Importance
# Feature importances for the random forest above, then XGBoost tuned over
# depth and min_child_weight on the CNN-undersampled train set.
feature_importance(bestModel, feats, 'Random Forest')
param_grid = {'max_depth':[3, 5, 7, 9, 12],
    'min_child_weight':[1,2,3,5,7]}
paramGrid = ParameterGrid(param_grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(xgb.XGBClassifier(random_state=0, n_jobs=4), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
Feature Importance
feature_importance(bestModel, feats, 'XGBoost')
# Same 128/64/32 architecture retrained on the CNN-undersampled data,
# this time with an explicit Adam learning rate and batch size 32.
Early_stop= keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=8,verbose=0, mode='min',
    restore_best_weights= True)
model1 = Sequential()
model1.add(Dense(128, activation='relu', input_shape=(input_dim,), kernel_initializer=he_normal(seed=None)))
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(64, activation='relu', kernel_initializer=he_normal(seed=None)) )
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(32, activation='relu', kernel_initializer=he_normal(seed=None)) )
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(output_dim, activation='sigmoid'))
# NOTE(review): saved before compile/fit, so model2.h5 holds untrained weights.
model1.save('model2.h5')
# NOTE(review): 'lr' is the legacy Keras argument name; newer Keras uses
# 'learning_rate' -- confirm against the installed version.
model1.compile(keras.optimizers.Adam(lr=0.001), loss = 'binary_crossentropy', metrics=['accuracy'])
model1.summary()
plot_model(model1, to_file='model1.png')
td = model_to_dot(model1, dpi=65)
td.set_size('60x12')
SVG(td.create(prog='dot', format='svg'))
train_model1 = model1.fit(x_train, y_train, validation_data=(x_val,y_val), batch_size=32, epochs=50, callbacks=[Early_stop])
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import iplot
def create_trace(x,y,ylabel,color):
    """Build a markers+lines plotly Scatter trace named `ylabel` in `color`."""
    return go.Scatter(
        x=x,
        y=y,
        name=ylabel,
        marker=dict(color=color),
        mode="markers+lines",
        # Hover text mirrors the x values (epoch numbers).
        text=x,
    )
def plot_accuracy_and_loss(train_model):
    """Plot per-epoch train/validation accuracy and loss side by side (plotly).

    train_model -- the History object returned by keras Model.fit
    """
    hist = train_model.history
    # NOTE(review): 'acc'/'val_acc' are old Keras history keys; newer Keras
    # uses 'accuracy'/'val_accuracy' -- confirm against the installed version.
    acc = hist['acc']
    val_acc = hist['val_acc']
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = list(range(1,len(acc)+1))
    trace_ta = create_trace(epochs,acc,"Training accuracy", "Green")
    trace_va = create_trace(epochs,val_acc,"Validation accuracy", "Red")
    trace_tl = create_trace(epochs,loss,"Training loss", "Blue")
    trace_vl = create_trace(epochs,val_loss,"Validation loss", "Magenta")
    fig = make_subplots(rows=1,cols=2, subplot_titles=('Training and validation accuracy',
        'Training and validation loss'))
    # Accuracy traces in column 1, loss traces in column 2.
    fig.append_trace(trace_ta,1,1)
    fig.append_trace(trace_va,1,1)
    fig.append_trace(trace_tl,1,2)
    fig.append_trace(trace_vl,1,2)
    fig['layout']['xaxis'].update(title = 'Epoch')
    fig['layout']['xaxis2'].update(title = 'Epoch')
    fig['layout']['yaxis'].update(title = 'Accuracy', range=[acc[0]-0.007,1])
    fig['layout']['yaxis2'].update(title = 'Loss', range=[0,val_loss[-1]+1])
    iplot(fig, filename='accuracy-loss')
plot_accuracy_and_loss(train_model1)
# BatchNormalization layers contribute 4 weight arrays each (gamma, beta,
# moving mean, moving variance), so the Dense kernels of this model sit at
# get_weights() indices 0, 6, 12 and 18 -- the original indices 2/4 pointed
# at batch-norm parameters. This 1x3 figure shows hidden layers 1 and 2 plus
# the output layer; the 32-unit hidden layer (index 12) is not visualised.
w_after = model1.get_weights()
h1_w = w_after[0].flatten().reshape(-1,1)
h2_w = w_after[6].flatten().reshape(-1,1)
out_w = w_after[18].flatten().reshape(-1,1)
# Violin plots of the weight vectors extracted above, then test-set
# evaluation of the retrained network.
fig = plt.figure()
plt.title("Weight matrices after model trained")
plt.subplot(1, 3, 1)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h1_w,color='b')
plt.xlabel('Hidden Layer 1')
plt.title("Weight matrices after model trained")
plt.subplot(1, 3, 2)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h2_w,color='b')
plt.xlabel('Hidden Layer 2')
plt.subplot(1, 3, 3)
plt.title("Trained model Weights")
ax = sns.violinplot(y=out_w,color='y')
plt.xlabel('Output Layer ')
plt.show()
score = model1.evaluate(x_test, y_test)
print('Test score:', score[0])
print('Test accuracy:', score[1])
# Sigmoid outputs are probabilities; round() thresholds them at 0.5.
y_pred = model1.predict(x_test)
y_test = pd.DataFrame(y_test)
y_pred = pd.DataFrame(y_pred)
plot_confusion_matrix(y_test, y_pred.round())
from prettytable import PrettyTable
# Test-set metrics (in %) for the models trained on the Condensed Nearest
# Neighbour undersampled data; numbers transcribed from the runs above.
x = PrettyTable()
x.title="Model Comparison"
x.field_names = ["Model","f1_score","Accuracy","Precision", "Recall"]
x.add_row(["Hyperparameter tuned KNearest Neighbor",83.20, 99.94, 95.61, 73.64])
x.add_row(["Hyperparameter tuned Logistic Regression", 49.42, 99.74, 37.54, 72.29])
x.add_row(["Hyperparameter tuned Random Forest", 85.60, 99.95, 94.30, 78.37])
x.add_row(["Hyperparameter tuned XG Boost", 73.48, 99.90, 69.69, 77.70])
x.add_row(["2 Hidden layers MLP, keras",67.44, 99.86, 59.18, 78.37])
print(x)
# Train-test split
# Re-split the full dataset from scratch, then oversample the minority
# class of the train split with SMOTE (synthetic minority examples).
X = dataset.drop(['Class'],axis=1)
Y = dataset.iloc[:,-1:]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify = Y, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, test_size=0.3, random_state=0)
smote = SMOTE(random_state = 0)
print("normal data distribution: {}".format(Counter(y_train.values.ravel())))
# fit_resample is the current imbalanced-learn API (fit_sample was removed),
# and it matches the CondensedNearestNeighbour call earlier in this file.
X_sample, y_sample = smote.fit_resample(x_train, y_train.values.ravel())
print("SMOTE data distribution: {}".format(Counter(y_sample)))
# The resampled arrays lose column names; y_train's label column is now 0.
x_train = pd.DataFrame(X_sample)
y_train = pd.DataFrame(y_sample)
print('the number of fraudulent transactions in train data is', y_train[0].value_counts()[1], 'and non fraudulent transactions is', y_train[0].value_counts()[0])
print('the number of fraudulent transactions in test data is', y_test['Class'].value_counts()[1], 'and non fraudulent transactions is', y_test['Class'].value_counts()[0])
print('the number of fraudulent transactions in validation data is', y_val['Class'].value_counts()[1], 'and non fraudulent transactions is', y_val['Class'].value_counts()[0])
# Logistic regression (SGD, log loss) tuned on the SMOTE-oversampled train set.
param={'alpha':[0.000001,0.00001,0.0001, 0.001, 0.01, 0.1]}
paramGrid = ParameterGrid(param)
bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier(loss='log', random_state=0), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Random forest tuned over depth and leaf size on the SMOTE train set.
param_grid = {
    'max_depth':[3,5,7,9,10,12],
    'min_samples_leaf':[1, 3, 5, 7, 10, 12]
}
paramGrid = ParameterGrid(param_grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(RandomForestClassifier(criterion='entropy',random_state=0, n_jobs=4),
    paramGrid, x_train, y_train, x_val, y_val, metric = f1_score,
    scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
Feature Importance
feature_importance(bestModel, feats, 'Random Forest')
# XGBoost tuned over depth and min_child_weight on the SMOTE train set.
param_grid = {'max_depth':[3, 5, 7, 9, 12],
    'min_child_weight':[1,2,3,5,7]}
paramGrid = ParameterGrid(param_grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(xgb.XGBClassifier(random_state=0, n_jobs=4), paramGrid,
    x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
Feature Importance
feature_importance(bestModel, feats, 'XGBoost')
# Deeper network for the much larger SMOTE train set: four hidden ReLU
# layers (256/128/64/32) with batch norm + dropout, sigmoid output,
# larger batch size and longer early-stopping patience.
Early_stop= keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10,verbose=0, mode='min',
    restore_best_weights= True)
model1 = Sequential()
model1.add(Dense(256, activation='relu', input_shape=(input_dim,), kernel_initializer=he_normal(seed=None)))
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(128, activation='relu', kernel_initializer=he_normal(seed=None)) )
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(64, activation='relu', kernel_initializer=he_normal(seed=None)) )
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(32, activation='relu', kernel_initializer=he_normal(seed=None)) )
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(output_dim, activation='sigmoid'))
# NOTE(review): saved before compile/fit (and overwrites the earlier
# model2.h5), so the file holds untrained weights.
model1.save('model2.h5')
model1.compile(keras.optimizers.Adam(lr=0.001), loss = 'binary_crossentropy', metrics=['accuracy'])
model1.summary()
plot_model(model1, to_file='model1.png')
td = model_to_dot(model1, dpi=65)
td.set_size('60x12')
SVG(td.create(prog='dot', format='svg'))
train_model1 = model1.fit(x_train, y_train, validation_data=(x_val,y_val), batch_size=400, epochs=80,
    shuffle=True, callbacks=[Early_stop])
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import iplot
def create_trace(x, y, ylabel, color):
    """Return a plotly Scatter trace drawing markers joined by lines.

    `ylabel` becomes the legend entry and `x` doubles as hover text.
    """
    return go.Scatter(
        x=x,
        y=y,
        name=ylabel,
        marker=dict(color=color),
        mode="markers+lines",
        text=x,
    )
def plot_accuracy_and_loss(train_model):
    """Plot training/validation accuracy and loss curves in two subplots.

    Parameters
    ----------
    train_model : the History object returned by keras Model.fit(); its
        .history dict must contain loss/accuracy series per epoch.
    """
    hist = train_model.history
    # BUG FIX: the accuracy key depends on the Keras version -- older
    # releases record 'acc'/'val_acc', newer ones 'accuracy'/'val_accuracy'
    # (the model is compiled with metrics=['accuracy']). Accept either.
    acc = hist.get('acc', hist.get('accuracy'))
    val_acc = hist.get('val_acc', hist.get('val_accuracy'))
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = list(range(1, len(acc) + 1))
    trace_ta = create_trace(epochs, acc, "Training accuracy", "Green")
    trace_va = create_trace(epochs, val_acc, "Validation accuracy", "Red")
    trace_tl = create_trace(epochs, loss, "Training loss", "Blue")
    trace_vl = create_trace(epochs, val_loss, "Validation loss", "Magenta")
    fig = make_subplots(rows=1, cols=2, subplot_titles=('Training and validation accuracy',
                                                        'Training and validation loss'))
    fig.append_trace(trace_ta, 1, 1)
    fig.append_trace(trace_va, 1, 1)
    fig.append_trace(trace_tl, 1, 2)
    fig.append_trace(trace_vl, 1, 2)
    fig['layout']['xaxis'].update(title='Epoch')
    fig['layout']['xaxis2'].update(title='Epoch')
    # y-ranges chosen so small changes near convergence remain visible.
    fig['layout']['yaxis'].update(title='Accuracy', range=[acc[0] - 0.007, 1])
    fig['layout']['yaxis2'].update(title='Loss', range=[0, val_loss[-1] + 0.08])
    iplot(fig, filename='accuracy-loss')
plot_accuracy_and_loss(train_model1)  # learning curves for the training run above
# BUG FIX: model1 interleaves BatchNormalization layers between the Dense
# layers, and each BatchNorm contributes FOUR arrays (gamma, beta,
# moving_mean, moving_variance) to get_weights(). The original flat indices
# 2/4/6/8 therefore plotted BatchNorm parameters, not Dense kernels (those
# sit at flat indices 0, 6, 12, 18, 24). Collect the kernels per Dense layer.
dense_kernels = [layer.get_weights()[0] for layer in model1.layers
                 if layer.__class__.__name__ == 'Dense']
h1_w = dense_kernels[0].flatten().reshape(-1,1)
h2_w = dense_kernels[1].flatten().reshape(-1,1)
h3_w = dense_kernels[2].flatten().reshape(-1,1)
h4_w = dense_kernels[3].flatten().reshape(-1,1)
out_w = dense_kernels[4].flatten().reshape(-1,1)
# One violin plot per layer's trained weight matrix.
fig = plt.figure()
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 1)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h1_w,color='b')
plt.xlabel('Hidden Layer 1')
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 2)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h2_w,color='b')
plt.xlabel('Hidden Layer 2')
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 3)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h3_w,color='b')
plt.xlabel('Hidden Layer 3')
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 4)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h4_w,color='b')
plt.xlabel('Hidden Layer 4')
plt.subplot(1, 5, 5)
plt.title("Trained model Weights")
ax = sns.violinplot(y=out_w,color='y')
plt.xlabel('Output Layer ')
plt.show()
# Test-set evaluation: evaluate() returns [loss, accuracy] per compile().
score = model1.evaluate(x_test, y_test)
print('Test score:', score[0])
print('Test accuracy:', score[1])
y_pred = model1.predict(x_test)
# NOTE(review): this rebinds the global y_test/y_pred to DataFrames; later
# cells recreate y_test from a fresh split before indexing y_test['Class'].
y_test = pd.DataFrame(y_test)
y_pred = pd.DataFrame(y_pred)
plot_confusion_matrix(y_test, y_pred.round())  # sigmoid outputs thresholded at 0.5
from prettytable import PrettyTable
# Summary of test-set metrics (in %) for the models tuned above.
x = PrettyTable()
x.title="Model Comparison"
x.field_names = ["Model","f1_score","Accuracy","Precision", "Recall"]
x.add_row(["Hyperparameter tuned Logistic Regression",17.83, 98.62, 9.94, 86.48])
x.add_row(["Hyperparameter tuned Random Forest", 69.83, 99.87, 59.52, 84.45])
x.add_row(["Hyperparameter tuned XG Boost", 78.17, 99.92, 75.47, 81.08])
# BUG FIX: the keras MLP defined above has FOUR hidden layers
# (256/128/64/32), not three -- correct the row label.
x.add_row(["4 Hidden layer MLP, keras",78.72, 99.92, 82.83, 75.0])
print(x)
#Train-test split
# X = all columns except the target; Y = the last column ('Class').
X = dataset.drop(['Class'],axis=1)
Y = dataset.iloc[:,-1:]
from sklearn.model_selection import train_test_split
# Stratified 70/30 train/test split, then a further 70/30 train/validation
# split, so the fraud/non-fraud proportions survive in every partition.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify = Y, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, test_size=0.3, random_state=0)
# Oversample the minority (fraud) class on the TRAINING split only.
adasyn = ADASYN(random_state = 0)
print("normal data distribution: {}".format(Counter(y_train.values.ravel())))
# fit_resample is the stable imbalanced-learn API (fit_sample was an alias
# removed in imblearn 0.8).
X_sample, y_sample = adasyn.fit_resample(x_train, y_train.values.ravel())
# BUG FIX: the resampler used here is ADASYN, not SMOTE -- label correctly.
print("ADASYN data distribution: {}".format(Counter(y_sample)))
x_train = pd.DataFrame(X_sample)
y_train = pd.DataFrame(y_sample)
print('the number of fraudulent transactions in train data is', y_train[0].value_counts()[1], 'and non fraudulent transactions is', y_train[0].value_counts()[0])
print('the number of fraudulent transactions in test data is', y_test['Class'].value_counts()[1], 'and non fraudulent transactions is', y_test['Class'].value_counts()[0])
print('the number of fraudulent transactions in validation data is', y_val['Class'].value_counts()[1], 'and non fraudulent transactions is', y_val['Class'].value_counts()[0])
# Logistic-regression-style linear model on the ADASYN-resampled data:
# SGDClassifier with log loss, tuning only the regularization strength alpha.
param={'alpha':[0.000001,0.00001,0.0001, 0.001, 0.01, 0.1]}
paramGrid = ParameterGrid(param)
# NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1+ --
# confirm against the installed version.
bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier(loss='log', random_state=0), paramGrid,
x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)  # test-set predictions from the tuned linear model
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Random-forest grid on the ADASYN-resampled data: tree depth and leaf size.
param_grid = {
'max_depth':[3,5,7,9,10,12],
'min_samples_leaf':[1, 3, 5, 7, 10, 12]
}
paramGrid = ParameterGrid(param_grid)
# Grid search via parfit, scored by f1 on the explicit validation split.
bestModel, bestScore, allModels, allScores = pf.bestFit(RandomForestClassifier(criterion='entropy',random_state=0, n_jobs=4),
paramGrid, x_train, y_train, x_val, y_val, metric = f1_score,
scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)  # evaluate the tuned RF on the held-out test split
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Feature Importance
feature_importance(bestModel, feats, 'Random Forest')  # feature importances of the tuned RF (ADASYN run)
# XGBoost grid on the ADASYN-resampled data: depth and min child weight.
param_grid = {'max_depth':[3, 5, 7, 9, 12],
'min_child_weight':[1,2,3,5,7]}
paramGrid = ParameterGrid(param_grid)
# Grid search via parfit, scored by f1 on the explicit validation split.
bestModel, bestScore, allModels, allScores = pf.bestFit(xgb.XGBClassifier(random_state=0, n_jobs=4), paramGrid,
x_train, y_train, x_val, y_val, metric = f1_score, scoreLabel = "f1_score", n_jobs=-1, nfolds=5)
print(bestModel, bestScore)
y_pred= bestModel.predict(x_test)  # evaluate the tuned XGBoost on the test split
plot_confusion_matrix(y_test,y_pred)
plot_precison_recall_curve(bestModel, x_test, y_test)
# Feature Importance
feature_importance(bestModel, feats, 'XG Boost')  # feature importances of the tuned XGBoost (ADASYN run)
# Stop training once val_loss stalls for 10 epochs; restore best weights.
Early_stop= keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10,verbose=0, mode='min',
restore_best_weights= True)
# Same 4-hidden-layer MLP architecture (256-128-64-32, BatchNorm + Dropout),
# retrained here on the ADASYN-resampled training data.
model1 = Sequential()
model1.add(Dense(256, activation='relu', input_shape=(input_dim,), kernel_initializer=he_normal(seed=None)))
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(128, activation='relu', kernel_initializer=he_normal(seed=None)))
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(64, activation='relu', kernel_initializer=he_normal(seed=None)))
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
model1.add(Dense(32, activation='relu', kernel_initializer=he_normal(seed=None)))
model1.add(BatchNormalization())
model1.add(Dropout(0.3))
# Single sigmoid unit for the binary fraud / non-fraud decision.
model1.add(Dense(output_dim, activation='sigmoid'))
model1.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])
# BUG FIX: save AFTER compile so 'model3.h5' includes the loss/optimizer
# configuration; the original saved an uncompiled model.
model1.save('model3.h5')
model1.summary()
plot_model(model1, to_file='model1.png')
td = model_to_dot(model1, dpi=65)
td.set_size('60x12')
SVG(td.create(prog='dot', format='svg'))
# Train with early stopping on the ADASYN-resampled training data.
train_model1 = model1.fit(x_train, y_train, validation_data=(x_val,y_val), batch_size=400, epochs=80,
                          shuffle=True, callbacks=[Early_stop])
plot_accuracy_and_loss(train_model1)  # learning curves for the ADASYN training run
# BUG FIX: model1 interleaves BatchNormalization layers between the Dense
# layers, and each BatchNorm contributes FOUR arrays (gamma, beta,
# moving_mean, moving_variance) to get_weights(). The original flat indices
# 2/4/6/8 therefore plotted BatchNorm parameters, not Dense kernels (those
# sit at flat indices 0, 6, 12, 18, 24). Collect the kernels per Dense layer.
dense_kernels = [layer.get_weights()[0] for layer in model1.layers
                 if layer.__class__.__name__ == 'Dense']
h1_w = dense_kernels[0].flatten().reshape(-1,1)
h2_w = dense_kernels[1].flatten().reshape(-1,1)
h3_w = dense_kernels[2].flatten().reshape(-1,1)
h4_w = dense_kernels[3].flatten().reshape(-1,1)
out_w = dense_kernels[4].flatten().reshape(-1,1)
# One violin plot per layer's trained weight matrix.
fig = plt.figure()
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 1)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h1_w,color='b')
plt.xlabel('Hidden Layer 1')
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 2)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h2_w,color='b')
plt.xlabel('Hidden Layer 2')
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 3)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h3_w,color='b')
plt.xlabel('Hidden Layer 3')
plt.title("Weight matrices after model trained")
plt.subplot(1, 5, 4)
plt.title("Trained model Weights")
ax = sns.violinplot(y=h4_w,color='b')
plt.xlabel('Hidden Layer 4')
plt.subplot(1, 5, 5)
plt.title("Trained model Weights")
ax = sns.violinplot(y=out_w,color='y')
plt.xlabel('Output Layer ')
plt.show()
# Test-set evaluation: evaluate() returns [loss, accuracy] per compile().
score = model1.evaluate(x_test, y_test)
print('Test score:', score[0])
print('Test accuracy:', score[1])
y_pred = model1.predict(x_test)
# NOTE(review): this rebinds the global y_test/y_pred to DataFrames.
y_test = pd.DataFrame(y_test)
y_pred = pd.DataFrame(y_pred)
plot_confusion_matrix(y_test, y_pred.round())  # sigmoid outputs thresholded at 0.5
from prettytable import PrettyTable
# Summary of test-set metrics (in %) for the ADASYN-resampled experiments.
x = PrettyTable()
x.title="Model Comparison"
x.field_names = ["Model","f1_score","Accuracy","Precision", "Recall"]
x.add_row(["Hyperparameter tuned Logistic Regression",5.48, 94.55, 2.82, 91.21])
x.add_row(["Hyperparameter tuned Random Forest", 36.83, 99.51, 23.72, 81.75])
x.add_row(["Hyperparameter tuned XG Boost", 80.13, 99.92, 78.57, 81.75])
# BUG FIX: the keras MLP defined above has FOUR hidden layers
# (256/128/64/32), not three -- correct the row label.
x.add_row(["4 Hidden layer MLP, keras",78.62, 99.92, 80.28, 77.02])
print(x)